# NOTE(review): rm(list = ls()) removed -- wiping the global environment
# inside a script is an anti-pattern: it silently deletes the user's
# workspace and does not detach packages or reset options anyway.
# Restart R for a clean session instead.
library(tidyverse)
## ── Attaching packages ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.0.0 ✔ purrr 0.2.5
## ✔ tibble 1.4.2 ✔ dplyr 0.7.6
## ✔ tidyr 0.8.1 ✔ stringr 1.3.1
## ✔ readr 1.1.1 ✔ forcats 0.3.0
## ── Conflicts ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(funModeling)
## Loading required package: Hmisc
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
## funModeling v.1.6.8 :)
## Examples and tutorials at livebook.datascienceheroes.com
library(caret)
##
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
##
## cluster
## The following object is masked from 'package:purrr':
##
## lift
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## Loading required package: data.table
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
## The following object is masked from 'package:purrr':
##
## transpose
## VIM is ready to use.
## Since version 4.0.0 the GUI is in its own package VIMGUI.
##
## Please use the package to use the new (and old) GUI.
## Suggestions and bug-reports can be submitted at: https://github.com/alexkowa/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
library(mice)
##
## Attaching package: 'mice'
## The following object is masked from 'package:tidyr':
##
## complete
## The following objects are masked from 'package:base':
##
## cbind, rbind
library(ggcorrplot)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:Hmisc':
##
## subplot
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following object is masked from 'package:colorspace':
##
## coords
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:data.table':
##
## hour, isoweek, mday, minute, month, quarter, second, wday,
## week, yday, year
## The following object is masked from 'package:base':
##
## date
library(glmnet)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following object is masked from 'package:tidyr':
##
## expand
## Loading required package: foreach
##
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
##
## accumulate, when
## Loaded glmnet 2.0-16
##
## Attaching package: 'glmnet'
## The following object is masked from 'package:pROC':
##
## auc
# NOTE(review): plotly was already attached above; this second
# library(plotly) call is a harmless no-op and could be dropped.
library(plotly)
# Fix the RNG seed so the sampling steps below are reproducible.
set.seed(1)
The goal for our final project was to build a prediction model using the LendingClub loans data set. We wanted to build a model that could predict a dichotomized outcome of loan status (which will be explained in more detail below) using the variables given in the data set. In this web page, we will walk you through our process for building our final prediction model.
Lara, can you explain where and how you got the data set?
# NOTE(review): hard-coded absolute setwd() makes the script
# non-portable -- prefer relative paths (or the here package).
setwd('/Users/shuheimiyasaka/Google Drive/Harvard/Courses/BST 260/BST-260-Final-Project')
# NOTE(review): file-name case differs from the save() call below
# ('loan.Rdata' vs "./loan.RData") -- this breaks on case-sensitive
# file systems; confirm which spelling is on disk.
load('loan.Rdata')
# One-time preprocessing kept for provenance: the raw CSV was read,
# cached as .RData, and a 3000-row subset exported for quick tests.
#loan.dat <- read.csv('loan.csv', header = TRUE)
#save(loan.dat, file = "./loan.RData")
#extract.3000 <- sample(1:dim(loan.dat)[1], 3000, replace = FALSE)
#write.csv(loan.dat[extract.3000, ], './loan_subset3000.csv')
The data set has 887,379 records with 74 variables.
# Quick structural look at the raw data: 887,379 loans x 74 variables.
dim(loan.dat)
## [1] 887379 74
names(loan.dat)
## [1] "id" "member_id"
## [3] "loan_amnt" "funded_amnt"
## [5] "funded_amnt_inv" "term"
## [7] "int_rate" "installment"
## [9] "grade" "sub_grade"
## [11] "emp_title" "emp_length"
## [13] "home_ownership" "annual_inc"
## [15] "verification_status" "issue_d"
## [17] "loan_status" "pymnt_plan"
## [19] "url" "desc"
## [21] "purpose" "title"
## [23] "zip_code" "addr_state"
## [25] "dti" "delinq_2yrs"
## [27] "earliest_cr_line" "inq_last_6mths"
## [29] "mths_since_last_delinq" "mths_since_last_record"
## [31] "open_acc" "pub_rec"
## [33] "revol_bal" "revol_util"
## [35] "total_acc" "initial_list_status"
## [37] "out_prncp" "out_prncp_inv"
## [39] "total_pymnt" "total_pymnt_inv"
## [41] "total_rec_prncp" "total_rec_int"
## [43] "total_rec_late_fee" "recoveries"
## [45] "collection_recovery_fee" "last_pymnt_d"
## [47] "last_pymnt_amnt" "next_pymnt_d"
## [49] "last_credit_pull_d" "collections_12_mths_ex_med"
## [51] "mths_since_last_major_derog" "policy_code"
## [53] "application_type" "annual_inc_joint"
## [55] "dti_joint" "verification_status_joint"
## [57] "acc_now_delinq" "tot_coll_amt"
## [59] "tot_cur_bal" "open_acc_6m"
## [61] "open_il_6m" "open_il_12m"
## [63] "open_il_24m" "mths_since_rcnt_il"
## [65] "total_bal_il" "il_util"
## [67] "open_rv_12m" "open_rv_24m"
## [69] "max_bal_bc" "all_util"
## [71] "total_rev_hi_lim" "inq_fi"
## [73] "total_cu_tl" "inq_last_12m"
# Per-variable profile (share of zeros, NAs, unique values), displayed
# with the most-missing variables at the top.
meta_loans <- funModeling::df_status(loan.dat, print_results = FALSE)
meta_loans[order(meta_loans$p_na, decreasing = TRUE), ]
As part of data exploration, we examined the percentage of “zeros”, missing records, and unique values in our data set per variable as shown above. From the table above, we notice quite a few variables with a lot of missing data.
Based on examining the data set and reading the data dictionary, we decided to immediately rule out the following variables from our model: id, member_id, url, and desc.
# Identifier / free-text columns with no predictive value.
cols.2.remove <- c('id', 'member_id', 'url', 'desc')
We decided to exclude variables with more than 10% missing data (19 variables).
# Variables whose percentage of missing values exceeds 10%.
missing.data.col <- meta_loans %>%
  filter(p_na > 10) %>%
  pull(variable)
missing.data.col
## [1] "mths_since_last_delinq" "mths_since_last_record"
## [3] "mths_since_last_major_derog" "annual_inc_joint"
## [5] "dti_joint" "open_acc_6m"
## [7] "open_il_6m" "open_il_12m"
## [9] "open_il_24m" "mths_since_rcnt_il"
## [11] "total_bal_il" "il_util"
## [13] "open_rv_12m" "open_rv_24m"
## [15] "max_bal_bc" "all_util"
## [17] "inq_fi" "total_cu_tl"
## [19] "inq_last_12m"
length(missing.data.col)
## [1] 19
# Add the high-missingness variables to the removal list.
cols.2.remove <- c(cols.2.remove, missing.data.col)
# Inspect variables sorted by their number of unique values.
meta_loans[order(meta_loans$unique),]
# policy_code is constant (one unique value) -- carries no information.
cols.2.remove <- c(cols.2.remove, 'policy_code')
We also decided to remove policy_code since it only has one unique value.
At this point, we had 50 potential covariates:
# Logical mask of the columns to keep (everything not flagged above).
cols.2.keep <- !(colnames(loan.dat) %in% cols.2.remove)
colnames(loan.dat)[cols.2.keep]
## [1] "loan_amnt" "funded_amnt"
## [3] "funded_amnt_inv" "term"
## [5] "int_rate" "installment"
## [7] "grade" "sub_grade"
## [9] "emp_title" "emp_length"
## [11] "home_ownership" "annual_inc"
## [13] "verification_status" "issue_d"
## [15] "loan_status" "pymnt_plan"
## [17] "purpose" "title"
## [19] "zip_code" "addr_state"
## [21] "dti" "delinq_2yrs"
## [23] "earliest_cr_line" "inq_last_6mths"
## [25] "open_acc" "pub_rec"
## [27] "revol_bal" "revol_util"
## [29] "total_acc" "initial_list_status"
## [31] "out_prncp" "out_prncp_inv"
## [33] "total_pymnt" "total_pymnt_inv"
## [35] "total_rec_prncp" "total_rec_int"
## [37] "total_rec_late_fee" "recoveries"
## [39] "collection_recovery_fee" "last_pymnt_d"
## [41] "last_pymnt_amnt" "next_pymnt_d"
## [43] "last_credit_pull_d" "collections_12_mths_ex_med"
## [45] "application_type" "verification_status_joint"
## [47] "acc_now_delinq" "tot_coll_amt"
## [49] "tot_cur_bal" "total_rev_hi_lim"
# 50 candidate covariates remain.
length(colnames(loan.dat)[cols.2.keep])
## [1] 50
# Drop the excluded columns from the working data set.
loan.dat <- loan.dat[, cols.2.keep]
We also decided to remove 6 records with missing or zero annual income since this is a covariate we definitely wanted to include in our model (and didn’t feel we could impute these values properly)!
# Drop records with missing or zero annual income. The check is
# NA-safe without the original three-step dance: is.na() is tested
# explicitly, because `NA == 0` evaluates to NA rather than TRUE.
query <- is.na(loan.dat$annual_inc) | loan.dat$annual_inc == 0
if (any(query)) {
  loan.dat <- loan.dat[!query, ]
} else stop('unexpected case')
With the remaining set of records and covariates, we decided to examine the pairwise correlation of covariates:
meta_loans <- funModeling::df_status(loan.dat, print_results = FALSE)
numeric_cols <- meta_loans$variable[meta_loans$type == 'numeric']
# Pairwise correlations among numeric covariates.
# BUG FIX: several numeric columns still contain NAs (e.g. revol_util),
# and cor()'s default use = "everything" then yields NA entries;
# "pairwise.complete.obs" computes each correlation on complete pairs.
# The redundant second data argument to cor() is dropped.
cor.dat <- cor(loan.dat[, numeric_cols], use = "pairwise.complete.obs")
plot_ly(x = colnames(cor.dat),
        y = rownames(cor.dat),
        z = cor.dat, type = "heatmap", colorscale = "Greys")
#ggcorrplot(cor(loan.dat[,numeric_cols]))
#aggr(loan.dat, combined=T, cex.axis=0.6)
We also calculated basic summary statistics of our covariates to help us better understand the data:
summary(loan.dat)
## loan_amnt funded_amnt funded_amnt_inv term
## Min. : 500 Min. : 500 Min. : 0 36 months:621119
## 1st Qu.: 8000 1st Qu.: 8000 1st Qu.: 8000 60 months:266254
## Median :13000 Median :13000 Median :13000
## Mean :14755 Mean :14742 Mean :14703
## 3rd Qu.:20000 3rd Qu.:20000 3rd Qu.:20000
## Max. :35000 Max. :35000 Max. :35000
##
## int_rate installment grade sub_grade
## Min. : 5.32 Min. : 15.67 A:148198 B3 : 56323
## 1st Qu.: 9.99 1st Qu.: 260.71 B:254535 B4 : 55626
## Median :12.99 Median : 382.55 C:245859 C1 : 53387
## Mean :13.25 Mean : 436.72 D:139541 C2 : 52235
## 3rd Qu.:16.20 3rd Qu.: 572.60 E: 70705 C3 : 50161
## Max. :28.99 Max. :1445.46 F: 23046 C4 : 48857
## G: 5489 (Other):570784
## emp_title emp_length home_ownership
## : 51451 10+ years:291569 ANY : 3
## Teacher : 13469 2 years : 78870 MORTGAGE:443555
## Manager : 11240 < 1 year : 70601 NONE : 46
## Registered Nurse: 5525 3 years : 70026 OTHER : 182
## Owner : 5376 1 year : 57095 OWN : 87470
## RN : 5355 5 years : 55704 RENT :356117
## (Other) :794957 (Other) :263508
## annual_inc verification_status issue_d
## Min. : 1200 Not Verified :266744 Oct-2015: 48631
## 1st Qu.: 45000 Source Verified:329558 Jul-2015: 45962
## Median : 65000 Verified :291071 Dec-2015: 44341
## Mean : 75028 Oct-2014: 38782
## 3rd Qu.: 90000 Nov-2015: 37529
## Max. :9500000 Aug-2015: 35886
## (Other) :636242
## loan_status pymnt_plan purpose
## Current :601777 n:887363 debt_consolidation:524214
## Fully Paid :207723 y: 10 credit_card :206181
## Charged Off : 45248 home_improvement : 51829
## Late (31-120 days): 11591 other : 42890
## Issued : 8460 major_purchase : 17277
## In Grace Period : 6253 small_business : 10377
## (Other) : 6321 (Other) : 34605
## title zip_code addr_state
## Debt consolidation :414000 945xx : 9770 CA :129517
## Credit card refinancing:164330 750xx : 9417 NY : 74082
## Home improvement : 40112 112xx : 9272 TX : 71136
## Other : 31892 606xx : 8641 FL : 60935
## Debt Consolidation : 15760 300xx : 8126 IL : 35476
## Major purchase : 12051 100xx : 7605 NJ : 33256
## (Other) :209228 (Other):834542 (Other):482971
## dti delinq_2yrs earliest_cr_line inq_last_6mths
## Min. : 0.00 Min. : 0.0000 Aug-2001: 6659 Min. : 0.0000
## 1st Qu.: 11.91 1st Qu.: 0.0000 Aug-2000: 6529 1st Qu.: 0.0000
## Median : 17.65 Median : 0.0000 Oct-2000: 6322 Median : 0.0000
## Mean : 18.13 Mean : 0.3144 Oct-2001: 6154 Mean : 0.6946
## 3rd Qu.: 23.95 3rd Qu.: 0.0000 Aug-2002: 6086 3rd Qu.: 1.0000
## Max. :1092.52 Max. :39.0000 Sep-2000: 5918 Max. :33.0000
## NA's :25 (Other) :849705 NA's :25
## open_acc pub_rec revol_bal revol_util
## Min. : 0.00 Min. : 0.0000 Min. : 0 Min. : 0.00
## 1st Qu.: 8.00 1st Qu.: 0.0000 1st Qu.: 6443 1st Qu.: 37.70
## Median :11.00 Median : 0.0000 Median : 11875 Median : 56.00
## Mean :11.55 Mean : 0.1953 Mean : 16921 Mean : 55.07
## 3rd Qu.:14.00 3rd Qu.: 0.0000 3rd Qu.: 20829 3rd Qu.: 73.60
## Max. :90.00 Max. :86.0000 Max. :2904836 Max. :892.30
## NA's :25 NA's :25 NA's :498
## total_acc initial_list_status out_prncp out_prncp_inv
## Min. : 1.00 f:456843 Min. : 0 Min. : 0
## 1st Qu.: 17.00 w:430530 1st Qu.: 0 1st Qu.: 0
## Median : 24.00 Median : 6459 Median : 6456
## Mean : 25.27 Mean : 8403 Mean : 8400
## 3rd Qu.: 32.00 3rd Qu.:13659 3rd Qu.:13654
## Max. :169.00 Max. :49373 Max. :49373
## NA's :25
## total_pymnt total_pymnt_inv total_rec_prncp total_rec_int
## Min. : 0 Min. : 0 Min. : 0 Min. : 0.0
## 1st Qu.: 1915 1st Qu.: 1900 1st Qu.: 1201 1st Qu.: 441.5
## Median : 4895 Median : 4862 Median : 3215 Median : 1073.3
## Mean : 7559 Mean : 7521 Mean : 5758 Mean : 1754.8
## 3rd Qu.:10617 3rd Qu.:10566 3rd Qu.: 8000 3rd Qu.: 2238.3
## Max. :57778 Max. :57778 Max. :35000 Max. :24205.6
##
## total_rec_late_fee recoveries collection_recovery_fee
## Min. : 0.0000 Min. : 0.00 Min. : 0.000
## 1st Qu.: 0.0000 1st Qu.: 0.00 1st Qu.: 0.000
## Median : 0.0000 Median : 0.00 Median : 0.000
## Mean : 0.3967 Mean : 45.92 Mean : 4.881
## 3rd Qu.: 0.0000 3rd Qu.: 0.00 3rd Qu.: 0.000
## Max. :358.6800 Max. :33520.27 Max. :7002.190
##
## last_pymnt_d last_pymnt_amnt next_pymnt_d last_credit_pull_d
## Jan-2016:470148 Min. : 0.0 Feb-2016:553404 Jan-2016:730572
## Dec-2015:150861 1st Qu.: 280.2 :252971 Dec-2015: 19308
## : 17659 Median : 462.8 Jan-2016: 78195 Nov-2015: 11490
## Oct-2015: 16000 Mean : 2164.2 Mar-2011: 107 Oct-2015: 10419
## Jul-2015: 14483 3rd Qu.: 831.2 Apr-2011: 101 Sep-2015: 10087
## Nov-2015: 13981 Max. :36475.6 Feb-2011: 91 Jul-2015: 8642
## (Other) :204241 (Other) : 2504 (Other) : 96855
## collections_12_mths_ex_med application_type
## Min. : 0.00000 INDIVIDUAL:886864
## 1st Qu.: 0.00000 JOINT : 509
## Median : 0.00000
## Mean : 0.01438
## 3rd Qu.: 0.00000
## Max. :20.00000
## NA's :141
## verification_status_joint acc_now_delinq tot_coll_amt
## :886864 Min. : 0.000000 Min. : 0
## Not Verified : 281 1st Qu.: 0.000000 1st Qu.: 0
## Source Verified: 61 Median : 0.000000 Median : 0
## Verified : 167 Mean : 0.004991 Mean : 226
## 3rd Qu.: 0.000000 3rd Qu.: 0
## Max. :14.000000 Max. :9152545
## NA's :25 NA's :70272
## tot_cur_bal total_rev_hi_lim
## Min. : 0 Min. : 0
## 1st Qu.: 29853 1st Qu.: 13900
## Median : 80559 Median : 23700
## Mean : 139458 Mean : 32069
## 3rd Qu.: 208205 3rd Qu.: 39800
## Max. :8000078 Max. :9999999
## NA's :70272 NA's :70272
In our prediction model, we decided to predict loan status. We dichotomized loan status into “Good” and “Bad” based on the following criteria. Good: (1) Fully Paid; (2) Current; (3) Does not meet the credit policy. Status:Fully Paid. Bad: (1) Default; (2) Charged Off; (3) Late (16-30 days); (4) Late (31-120 days); (5) In Grace Period; (6) Does not meet the credit policy. Status:Charged Off.
Since we are predicting loan status using our model, we decided to drop records with loan_status with values Issued:
# Drop loans that were just issued -- their eventual outcome is unknown,
# so they cannot be labeled Good or Bad.
loan.dat <- loan.dat[loan.dat$loan_status != 'Issued', ]
# Dichotomize loan_status: "Good" for the fully-paid / current statuses
# listed below, "Bad" for everything else. %in% never returns NA,
# unlike the original chain of `==` comparisons joined with `|`.
good_status <- c('Fully Paid', 'Current',
                 'Does not meet the credit policy. Status:Fully Paid')
loan.dat$loan_status_bin <- factor(
  ifelse(loan.dat$loan_status %in% good_status, 'Good', 'Bad'))
We also converted funded_amnt_inv to be percent funded amount by investors perc_funded_amnt_inv and only use the year from issue date (issue_d):
# Derive the fraction of the funded amount contributed by investors,
# coerce issue_d and term to character, and take the trailing 4-digit
# issue year from issue_d. A single mutate() suffices: columns are
# evaluated sequentially, so `year` sees the character issue_d.
loan.dat <- loan.dat %>%
  mutate(
    perc_funded_amnt_inv = funded_amnt_inv / funded_amnt,
    issue_d = as.character(issue_d),
    term = as.character(term),
    year = as.numeric(str_sub(issue_d, start = -4))
  )
We reclassified a 36 month loan to “Short” and a 60 month loan to “Long”.
# Recode term in one pass: ' 36 months' (note the leading space in the
# raw data) becomes 'Short', everything else 'Long'; factor() yields
# the same alphabetical levels (Long, Short) as the original as.factor().
loan.dat$term <- factor(ifelse(loan.dat$term == ' 36 months',
                               'Short', 'Long'))
We also dichotomized tot_coll_amt to 0 and greater than 0 (and replaced the missing values to zero):
# Treat missing total collection amounts as zero (assigning through a
# zero-length index is a no-op, so no guard is needed), then flag
# whether any amount was ever collected.
loan.dat$tot_coll_amt[is.na(loan.dat$tot_coll_amt)] <- 0
loan.dat <- loan.dat %>%
  mutate(tot_coll_amt_gt0 = as.factor(tot_coll_amt > 0))
Based on examining the uni-variate plots and the correlation plot, we decided to keep the following 27 covariates: ### TODO: we can probably add a more detailed reason for dropping certain columns, as outlined in the Google Doc
# Final list of 27 covariates retained for modeling. Note: the order
# here fixes the column order of loan.dat below, so do not reorder.
predictors <- c('loan_amnt', 'funded_amnt',
'int_rate', 'grade',
'emp_length', 'home_ownership',
'annual_inc', 'verification_status',
'purpose',
'addr_state', 'dti',
'delinq_2yrs', 'inq_last_6mths',
'open_acc', 'pub_rec',
'revol_bal', 'revol_util',
'total_acc', 'initial_list_status',
'application_type', 'acc_now_delinq',
'tot_coll_amt_gt0', 'tot_cur_bal',
'total_rev_hi_lim', 'perc_funded_amnt_inv',
'term', 'year')
predictors
## [1] "loan_amnt" "funded_amnt" "int_rate"
## [4] "grade" "emp_length" "home_ownership"
## [7] "annual_inc" "verification_status" "purpose"
## [10] "addr_state" "dti" "delinq_2yrs"
## [13] "inq_last_6mths" "open_acc" "pub_rec"
## [16] "revol_bal" "revol_util" "total_acc"
## [19] "initial_list_status" "application_type" "acc_now_delinq"
## [22] "tot_coll_amt_gt0" "tot_cur_bal" "total_rev_hi_lim"
## [25] "perc_funded_amnt_inv" "term" "year"
# The dichotomized outcome defined above.
outcome <- c('loan_status_bin')
# Keep only the outcome plus the selected predictors.
loan.dat <- loan.dat[, c(outcome, predictors)]
We also did a simple uni-variate analysis of the covariates using histograms and boxplots:
meta_loans <- funModeling::df_status(loan.dat, print_results = FALSE)
numeric_cols <- meta_loans$variable[meta_loans$type == 'numeric']
# Histogram of each numeric covariate.
# FIX: map the column through the .data pronoun rather than
# aes(loan.dat[, col_name]) -- the latter injects an external vector
# into the plot, which silently breaks if the piped data is ever
# filtered, faceted, or reordered.
for (col_name in numeric_cols) {
  plt <- loan.dat %>%
    ggplot(aes(x = .data[[col_name]])) +
    geom_histogram(color = "black") +
    ggtitle(col_name) +
    labs(x = col_name)
  print(plt)
}
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 25 rows containing non-finite values (stat_bin).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 25 rows containing non-finite values (stat_bin).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 25 rows containing non-finite values (stat_bin).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 25 rows containing non-finite values (stat_bin).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 495 rows containing non-finite values (stat_bin).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 25 rows containing non-finite values (stat_bin).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 25 rows containing non-finite values (stat_bin).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 70272 rows containing non-finite values (stat_bin).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 70272 rows containing non-finite values (stat_bin).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Boxplot of each numeric covariate split by the dichotomized outcome.
# FIX: map y through the .data pronoun instead of passing the external
# vector loan.dat[, col_name] positionally into aes() -- keeps the
# aesthetic tied to the piped data frame.
for (col_name in numeric_cols) {
  plt <- loan.dat %>%
    ggplot(aes(x = loan_status_bin, y = .data[[col_name]])) +
    geom_boxplot() +
    ggtitle(col_name) +
    labs(x = 'Loan Status', y = col_name)
  print(plt)
}
## Warning: Removed 25 rows containing non-finite values (stat_boxplot).
## Warning: Removed 25 rows containing non-finite values (stat_boxplot).
## Warning: Removed 25 rows containing non-finite values (stat_boxplot).
## Warning: Removed 25 rows containing non-finite values (stat_boxplot).
## Warning: Removed 495 rows containing non-finite values (stat_boxplot).
## Warning: Removed 25 rows containing non-finite values (stat_boxplot).
## Warning: Removed 25 rows containing non-finite values (stat_boxplot).
## Warning: Removed 70272 rows containing non-finite values (stat_boxplot).
## Warning: Removed 70272 rows containing non-finite values (stat_boxplot).
We still have a few records with missing data. We initially tried imputing these values using the mice package but due to computational reasons, we decided to drop this idea.
# Re-check missingness after variable selection.
meta_loans <- funModeling::df_status(loan.dat, print_results = FALSE)
meta_loans[order(-meta_loans$p_na),]
# mice-based multiple imputation was abandoned for runtime reasons;
# kept here for reference.
# loan.dat = mice(loan.dat, m=1) # let's just impute one dataset
# loan.dat = complete(loan.dat, 1)
Instead, we decided to set the missing values to the mean using the rest of the data:
# Sanity check: the outcome itself has no missing values.
sum(is.na(loan.dat$loan_status_bin))
## [1] 0
# Mean-impute the remaining NAs, numeric columns only.
# FIX: the original called mean() on any column with NAs; mean() of a
# factor is NA (with a warning), which would silently corrupt
# non-numeric columns. seq_len() is also safe if ncol() were 0.
for (j in seq_len(ncol(loan.dat))) {
  if (is.numeric(loan.dat[, j])) {
    miss <- is.na(loan.dat[, j])
    if (any(miss)) {
      loan.dat[miss, j] <- mean(loan.dat[, j], na.rm = TRUE)
    }
  }
}
# Confirm no missing values remain anywhere.
sum(is.na(loan.dat))
## [1] 0
Here is a final basic summary statistics of the remaining covariates:
summary(loan.dat)
## loan_status_bin loan_amnt funded_amnt int_rate
## Bad : 67429 Min. : 500 Min. : 500 Min. : 5.32
## Good:811484 1st Qu.: 8000 1st Qu.: 8000 1st Qu.: 9.99
## Median :13000 Median :13000 Median :12.99
## Mean :14750 Mean :14737 Mean :13.25
## 3rd Qu.:20000 3rd Qu.:20000 3rd Qu.:16.20
## Max. :35000 Max. :35000 Max. :28.99
##
## grade emp_length home_ownership annual_inc
## A:146750 10+ years:288752 ANY : 3 Min. : 1200
## B:252006 2 years : 78167 MORTGAGE:439335 1st Qu.: 45000
## C:243387 < 1 year : 69828 NONE : 46 Median : 64800
## D:138356 3 years : 69344 OTHER : 182 Mean : 74996
## E: 70112 1 year : 56547 OWN : 86432 3rd Qu.: 90000
## F: 22852 5 years : 55194 RENT :352915 Max. :9500000
## G: 5450 (Other) :261081
## verification_status purpose addr_state
## Not Verified :263965 debt_consolidation:519418 CA :128370
## Source Verified:326722 credit_card :204110 NY : 73360
## Verified :288226 home_improvement : 51336 TX : 70445
## other : 42410 FL : 60328
## major_purchase : 17093 IL : 35178
## small_business : 10265 NJ : 32960
## (Other) : 34281 (Other):478272
## dti delinq_2yrs inq_last_6mths open_acc
## Min. : 0.00 Min. : 0.0000 Min. : 0.0000 Min. : 0.00
## 1st Qu.: 11.90 1st Qu.: 0.0000 1st Qu.: 0.0000 1st Qu.: 8.00
## Median : 17.64 Median : 0.0000 Median : 0.0000 Median :11.00
## Mean : 18.12 Mean : 0.3141 Mean : 0.6952 Mean :11.54
## 3rd Qu.: 23.93 3rd Qu.: 0.0000 3rd Qu.: 1.0000 3rd Qu.:14.00
## Max. :1092.52 Max. :39.0000 Max. :33.0000 Max. :90.00
##
## pub_rec revol_bal revol_util total_acc
## Min. : 0.0000 Min. : 0 Min. : 0.0 Min. : 1.00
## 1st Qu.: 0.0000 1st Qu.: 6446 1st Qu.: 37.7 1st Qu.: 17.00
## Median : 0.0000 Median : 11875 Median : 56.0 Median : 24.00
## Mean : 0.1948 Mean : 16912 Mean : 55.1 Mean : 25.26
## 3rd Qu.: 0.0000 3rd Qu.: 20823 3rd Qu.: 73.6 3rd Qu.: 32.00
## Max. :86.0000 Max. :2904836 Max. :892.3 Max. :169.00
##
## initial_list_status application_type acc_now_delinq
## f:456023 INDIVIDUAL:878468 Min. : 0.000000
## w:422890 JOINT : 445 1st Qu.: 0.000000
## Median : 0.000000
## Mean : 0.004994
## 3rd Qu.: 0.000000
## Max. :14.000000
##
## tot_coll_amt_gt0 tot_cur_bal total_rev_hi_lim perc_funded_amnt_inv
## FALSE:764171 Min. : 0 Min. : 0 Min. :0.0000
## TRUE :114742 1st Qu.: 32250 1st Qu.: 14700 1st Qu.:1.0000
## Median : 100278 Median : 25800 Median :1.0000
## Mean : 139394 Mean : 32035 Mean :0.9964
## 3rd Qu.: 195607 3rd Qu.: 37800 3rd Qu.:1.0000
## Max. :8000078 Max. :9999999 Max. :1.0000
##
## term year
## Long :263776 Min. :2007
## Short:615137 1st Qu.:2013
## Median :2014
## Mean :2014
## 3rd Qu.:2015
## Max. :2015
##
# NOTE(review): hard-coded absolute path; prefer relative paths.
setwd('/Users/shuheimiyasaka/Google Drive/Harvard/Courses/BST 260/BST-260-Final-Project')
# Cache the cleaned data set for the (slow) modeling chunk below.
save(loan.dat, file = "./loan.dat.RData")
We decided to evaluate three models: 1. Ridge 2. Lasso 3. Elastic Net
In evaluating our model, we picked our final model based on the best test performance (i.e., largest AUC). We kept about a fifth of the data (200,000 records) for testing and used the rest for training our models. We trained our candidate models using 10-fold cross-validation (on the training set) to obtain the optimal tuning parameters and finally tested their performance on the test data set.
# This chunk takes a very very long time to run!!!
# I ran this overnight on my laptop and saved the results;
# that's why eval is set to FALSE.
# NOTE(review): hard-coded absolute path; prefer relative paths.
setwd('/Users/shuheimiyasaka/Google Drive/Harvard/Courses/BST 260/BST-260-Final-Project')
# NOTE(review): saved above as 'loan.dat.RData' but loaded here as
# 'loan.dat.Rdata' -- this only works on case-insensitive file
# systems; confirm the spelling on disk.
load('loan.dat.Rdata')
# Hold out 200,000 records (~1/5 of the data) to assess test
# performance; the remainder is the training set.
set.seed(1)
test.indx <- sample(seq_len(nrow(loan.dat)), 200000, replace = FALSE)
train.indx <- setdiff(seq_len(nrow(loan.dat)), test.indx)
# this is to debug the code on a small subsample
# small.data.indx <- sample(1:dim(loan.dat)[1], 10000, replace = FALSE)
# loan.dat <- loan.dat[small.data.indx, ]
# test.indx <- sample(1:dim(loan.dat)[1], 500, replace = FALSE)
# train.indx <- setdiff(1:dim(loan.dat)[1], test.indx)
# 10-fold cross-validation, one repeat; class probabilities are needed
# so caret can compute ROC/AUC via twoClassSummary.
train.control <- trainControl(method = "repeatedcv", number = 10,
                              repeats = 1, classProbs = TRUE,
                              summaryFunction = twoClassSummary)
# Candidate models (adaboost was dropped for runtime reasons).
#models = c("ridge", "lasso", "enet", "adaboost")
models <- c("ridge", "lasso", "enet")
n_models <- length(models)
# Exploratory fits kept for reference.
# test.indx2 <- sample(1:dim(loan.dat)[1], 10000, replace = FALSE)
# fit <- glm(loan_status_bin ~., family=binomial(link='logit'),
# data=loan.dat[test.indx2, ])
# summary(fit)
# x = model.matrix(loan_status_bin ~ ., loan.dat)[test.indx2,-1]
# y = loan.dat$loan_status_bin[test.indx2]
# fit.lasso <- cv.glmnet(x=x, y=y, family="binomial", alpha=1)
# fit.lasso
# coef.min = coef(fit.lasso, s = "lambda.min")
# coef.min
#
# fit.enet <- cv.glmnet(x=x, y=y, family="binomial", alpha=0.5)
# fit.enet
#
# fit.ridge <- cv.glmnet(x=x, y=y, family="binomial", alpha=0)
# fit.ridge
# Preallocate the per-model test-set AUC vector.
AUC <- rep(0, n_models)
names(AUC) <- models
# Train each candidate model on the training split, then plot its ROC
# curve and record its AUC on the held-out test split.
for (m in seq_len(n_models)) {
  print(paste0("Training model: ", models[m]))
  # Each branch keeps its own fitted object so results can be saved.
  if (models[m] == 'ridge') {
    fit <- train(loan_status_bin ~ .,
                 data = loan.dat[train.indx, ],
                 method = "glmnet", metric = "ROC",
                 trControl = train.control,
                 tuneGrid = expand.grid(alpha = 0, lambda = .5 ^ (-20:20)))
    fit.ridge <- fit
  } else if (models[m] == 'lasso') {
    fit <- train(loan_status_bin ~ .,
                 data = loan.dat[train.indx, ],
                 method = "glmnet", metric = "ROC",
                 trControl = train.control,
                 tuneGrid = expand.grid(alpha = 1, lambda = .5 ^ (-20:20)))
    fit.lasso <- fit
  } else if (models[m] == 'enet') {
    fit <- train(loan_status_bin ~ .,
                 data = loan.dat[train.indx, ],
                 method = "glmnet", metric = "ROC",
                 trControl = train.control,
                 tuneGrid = expand.grid(alpha = seq(.05, .95, .05),
                                        lambda = .5 ^ (-20:20)))
    fit.enet <- fit
  } else if (models[m] == 'adaboost') {
    fit <- train(loan_status_bin ~ .,
                 data = loan.dat[train.indx, ],
                 method = models[m], metric = "ROC",
                 trControl = train.control)
    fit.adaboost <- fit
  } else {
    # BUG FIX: the original `else('Unknown model type!')` merely
    # evaluated a bare string (a no-op), so an unrecognized model name
    # fell through with `fit` left over from the previous iteration.
    stop('Unknown model type!')
  }
  # Predicted class probabilities on the test set -> ROC curve and AUC.
  probs <- predict(fit, loan.dat[test.indx, ], type = "prob")
  R <- roc(loan.dat$loan_status_bin[test.indx], probs$Good)
  plot.roc(R, add = (m > 1), col = m, lwd = 2, main = "ROC curves")
  legend("bottomright", legend = models, col = 1:n_models, lwd = 2)
  AUC[m] <- R$auc
}
AUC
# NOTE(review): hard-coded absolute path; prefer relative paths.
setwd('/Users/shuheimiyasaka/Google Drive/Harvard/Courses/BST 260/BST-260-Final-Project')
# Persist the fitted models so the expensive training chunk above
# can be skipped on re-knit.
save(fit.ridge, fit.lasso, fit.enet, file = "./models.RData")
#save(fit.ridge, fit.lasso, fit.enet, fit.adaboost, file = "./models.RData")
## ridge lasso enet
## 0.7494317 0.7507364 0.7508311
The AUC values for all three models are similar. However, the elastic net model with \(\alpha=0.05\) and \(\lambda=0.0001220703\) had the largest AUC. Therefore, for our final prediction model, we decided to use the elastic net model with \(\alpha=0.05\) and \(\lambda=0.0001220703\) and retrained the model with all data.
# Refit the winning elastic-net model (alpha = 0.05, lambda chosen by
# cross-validation above) on the full data set. model.matrix() expands
# factors to dummy columns; the leading intercept column is dropped
# because glmnet fits its own intercept.
design_mat <- model.matrix(loan_status_bin ~ ., loan.dat)[, -1]
response <- loan.dat$loan_status_bin
final_mod <- glmnet(x = design_mat, y = response, family = 'binomial',
                    alpha = 0.05, lambda = 0.0001220703)
#setwd('/Users/shuheimiyasaka/Google Drive/Harvard/Courses/BST 260/BST-260-Final-Project')
#save(final_mod, file = "./final_model.RData")
So our final model is an elastic net model (\(\alpha=0.05\) and \(\lambda=0.0001220703\)) with the following regression coefficients:
coef(final_mod)
## 109 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) -7.755477e+02
## loan_amnt -9.019996e-06
## funded_amnt -1.372821e-06
## int_rate -2.158074e-01
## gradeB 9.012855e-02
## gradeC 3.026705e-01
## gradeD 6.032857e-01
## gradeE 9.440011e-01
## gradeF 1.378499e+00
## gradeG 1.607011e+00
## emp_length1 year 5.455825e-02
## emp_length10+ years 1.131587e-01
## emp_length2 years 7.302165e-02
## emp_length3 years 5.104954e-02
## emp_length4 years 7.584263e-02
## emp_length5 years 4.969832e-02
## emp_length6 years -1.961420e-02
## emp_length7 years 1.356539e-02
## emp_length8 years 4.307889e-02
## emp_length9 years 1.167799e-02
## emp_lengthn/a -1.202705e-01
## home_ownershipMORTGAGE 7.508743e-02
## home_ownershipNONE 1.218026e-01
## home_ownershipOTHER 2.321932e-01
## home_ownershipOWN 4.033432e-02
## home_ownershipRENT -4.767680e-02
## annual_inc 2.529573e-06
## verification_statusSource Verified -8.357074e-02
## verification_statusVerified -3.158425e-02
## purposecredit_card -6.107965e-02
## purposedebt_consolidation -1.529318e-01
## purposeeducational 1.332428e-01
## purposehome_improvement -1.524668e-01
## purposehouse -9.399033e-02
## purposemajor_purchase -8.485275e-02
## purposemedical -1.956414e-01
## purposemoving -1.636192e-01
## purposeother -8.853654e-02
## purposerenewable_energy -2.883932e-01
## purposesmall_business -5.102216e-01
## purposevacation -9.262761e-02
## purposewedding 2.826418e-01
## addr_stateAL -1.973950e-01
## addr_stateAR -5.919637e-02
## addr_stateAZ -1.210503e-01
## addr_stateCA -1.177515e-01
## addr_stateCO 1.088124e-01
## addr_stateCT 2.500888e-02
## addr_stateDC 4.654147e-01
## addr_stateDE -4.662514e-02
## addr_stateFL -1.833211e-01
## addr_stateGA -1.614537e-02
## addr_stateHI -1.610241e-01
## addr_stateIA -1.538244e-01
## addr_stateID 1.155041e+00
## addr_stateIL 1.102539e-01
## addr_stateIN -1.250537e-01
## addr_stateKS 1.497544e-01
## addr_stateKY -1.219469e-03
## addr_stateLA -1.542646e-01
## addr_stateMA -9.054951e-02
## addr_stateMD -1.479567e-01
## addr_stateME 2.801215e+00
## addr_stateMI -9.021531e-02
## addr_stateMN -1.043299e-01
## addr_stateMO -6.335870e-02
## addr_stateMS 6.428139e-05
## addr_stateMT 8.925641e-02
## addr_stateNC -1.578348e-01
## addr_stateND 1.508759e+00
## addr_stateNE 7.619997e-01
## addr_stateNH 1.847850e-01
## addr_stateNJ -1.405565e-01
## addr_stateNM -1.532665e-01
## addr_stateNV -2.880094e-01
## addr_stateNY -1.690205e-01
## addr_stateOH -1.725368e-02
## addr_stateOK -1.637056e-01
## addr_stateOR 6.320724e-04
## addr_statePA -6.956881e-02
## addr_stateRI -8.385937e-02
## addr_stateSC 1.253668e-01
## addr_stateSD -6.817322e-02
## addr_stateTN -1.476443e-01
## addr_stateTX 3.390241e-02
## addr_stateUT -1.342979e-01
## addr_stateVA -1.562151e-01
## addr_stateVT 2.305194e-01
## addr_stateWA 1.053958e-02
## addr_stateWI 3.220255e-02
## addr_stateWV 1.808913e-01
## addr_stateWY 2.795099e-01
## dti -1.011735e-02
## delinq_2yrs -3.198929e-02
## inq_last_6mths -8.881619e-02
## open_acc -3.714554e-03
## pub_rec 4.346689e-02
## revol_bal 2.975497e-07
## revol_util -1.001653e-03
## total_acc -1.530366e-04
## initial_list_statusw 1.095776e-01
## application_typeJOINT 2.266379e+00
## acc_now_delinq -1.869035e-02
## tot_coll_amt_gt0TRUE -5.309220e-03
## tot_cur_bal 2.643953e-07
## total_rev_hi_lim 2.053354e-06
## perc_funded_amnt_inv -5.164081e-01
## termShort -1.204291e-01
## year 3.882085e-01